Source code for nlp_architect.data.fasttext_emb

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import io
import os
import sys

import numpy as np
from six.moves import urllib

from nlp_architect.utils.generic import license_prompt


class FastTextEmb:
    """
    Downloads FastText Embeddings for a given language to the given path.

    Arguments:
        path(str): Local directory in which the ``wiki.<language>.vec`` file
            is looked up and, if missing, downloaded
        language(str): Embeddings language
        vocab_size(int): Number of distinct words to read from the file
        emb_dim(int): Embedding dimension the file is expected to declare
            in its header line (default 300)

    Usage:
        ``load_embeddings()`` returns a ``Dictionary`` (word <-> id mapping)
        and a float32 numpy array of shape (n_words, emb_dim).
    """

    def __init__(self, path, language, vocab_size, emb_dim=300):
        self.path = path
        self.language = language
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        # NOTE(review): hosting locations change over time - confirm this
        # bucket still serves the vectors.
        self.url = "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki." + language + ".vec"

    def _maybe_download(self):
        """
        Download the embeddings file from ``self.url`` unless it is already
        present in ``self.path``.

        Returns:
            Path of the local ``wiki.<language>.vec`` file.

        Raises:
            SystemExit: if ``license_prompt`` returns a falsy value
                (presumably the user declining the license - confirm
                against ``license_prompt``).
        """
        filename = "wiki." + self.language + ".vec"
        filepath = os.path.join(self.path, filename)
        link = "https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md"

        if os.path.exists(filepath):
            print("Found FastText embeddings for " + self.language + " at " + filepath)
            return filepath

        if not license_prompt(filepath, link, self.path):
            # ``exit`` is an interactive helper injected by ``site``;
            # ``sys.exit`` is the supported way to stop a program.
            sys.exit()

        # urlretrieve cannot write into a directory that does not exist yet.
        if self.path and not os.path.isdir(self.path):
            os.makedirs(self.path)

        print("Downloading FastText embeddings for " + self.language + " to " + filepath)
        urllib.request.urlretrieve(self.url, filepath)
        statinfo = os.stat(filepath)
        print("Sucessfully downloaded", filename, statinfo.st_size, "bytes")
        return filepath

    def _parse_vec_file(self, filepath):
        """
        Parse a FastText ``.vec`` text file.

        The first line must hold two fields, the second being the embedding
        dimension; every following line is ``<word> <v1> <v2> ...``.

        Returns:
            (word2id, vectors): ``word2id`` maps each distinct word to its
            row index; ``vectors`` is a list of arrays of shape
            (1, emb_dim), in the same order.
        """
        word2id = {}
        vectors = []
        # The file is read as UTF-8 regardless of the platform default
        # encoding. newline='\n' keeps a stray '\r' inside a token from
        # splitting a record in two.
        with io.open(filepath, 'r', encoding='utf-8', newline='\n', errors='ignore') as emb_file:
            for i, line in enumerate(emb_file):
                # Line zero has total words, emb dimensions
                if i == 0:
                    header = line.split()
                    assert len(header) == 2
                    assert self.emb_dim == int(header[1])
                    continue

                # Rest of the lines are in word, word_vec format
                word, values = line.rstrip().split(' ', 1)
                if word in word2id:
                    # A repeated word would leave two words sharing one id
                    # and more vectors than dictionary entries; keep only
                    # the first occurrence.
                    continue

                vector = np.fromstring(values, sep=' ')
                # If norm is zero fill with 0.01
                if np.linalg.norm(vector) == 0:
                    vector[0] = 0.01
                assert vector.shape == (self.emb_dim, ), i

                # Assign a token
                word2id[word] = len(word2id)
                vectors.append(vector[None])

                # Stop once vocab_size distinct words were collected. The
                # check runs after the append, so at least one word is
                # always read.
                if len(word2id) >= self.vocab_size:
                    break
        return word2id, vectors

    def read_embeddings(self, filepath):
        """
        Read up to ``vocab_size`` distinct words and their vectors from
        ``filepath``.

        Returns:
            (dico, word_vec): a ``Dictionary`` combining id2word and
            word2id, and a numpy array of shape (n_words, emb_dim). The
            vectors are returned as read (no normalization is applied),
            except that an all-zero vector gets 0.01 in its first slot.
        """
        word2id, vectors = self._parse_vec_file(filepath)
        # Reverse dictionary
        id2word = {v: k for k, v in word2id.items()}
        # Dictionary just combines both id2word and word2id into one dict
        dico = Dictionary(id2word, word2id, self.language)
        # All word_vectors
        word_vec = np.concatenate(vectors, 0)
        return dico, word_vec

    def load_embeddings(self):
        """
        Ensure the embeddings file is available locally, then read it.

        Returns:
            (dico, word_vec): the ``Dictionary`` and a float32 numpy array
            of shape (n_words, emb_dim).
        """
        # Check if embeddings exist else download
        filepath = self._maybe_download()
        # Read embeddings
        dico, word_vec = self.read_embeddings(filepath)
        print("Completed loading embeddings for " + self.language)
        word_vec = np.float32(word_vec)
        return dico, word_vec
def get_eval_data(eval_path, src_lang, tgt_lang):
    """
    Downloads evaluation cross lingual dictionaries to the eval_path

    Arguments:
        eval_path: Path where cross-lingual dictionaries are downloaded
        src_lang : Source Language
        tgt_lang : Target Language

    Returns:
        Path of the ``<src_lang>-<tgt_lang>.5000-6500.txt`` dictionary file
        inside ``eval_path``

    Raises:
        SystemExit: if the file is missing and ``license_prompt`` returns a
            falsy value (presumably the user declining the license -
            confirm against ``license_prompt``).
    """
    eval_url = 'https://s3.amazonaws.com/arrival/dictionaries/'
    link = "https://github.com/facebookresearch/MUSE#ground-truth-bilingual-dictionaries"
    # Build the file name once so the local path and the URL cannot drift.
    filename = '%s-%s.5000-6500.txt' % (src_lang, tgt_lang)
    src_path = os.path.join(eval_path, filename)

    if os.path.exists(src_path):
        return src_path

    # Third argument is the target directory, matching how FastTextEmb
    # calls license_prompt (file path, link, directory).
    if not license_prompt(src_path, link, eval_path):
        sys.exit()

    # Create the directory without going through a shell: portable, and
    # eval_path is never interpreted as shell syntax.
    if eval_path and not os.path.isdir(eval_path):
        os.makedirs(eval_path)

    print("Downloading cross-lingual dictionaries for " + src_lang)
    urllib.request.urlretrieve(eval_url + filename, src_path)
    print("Completed downloading to " + eval_path)
    return src_path
class Dictionary:
    """
    Merges word2idx and idx2word dictionaries

    Arguments:
        id2word: dictionary mapping the ids 0..n-1 to words
        word2id: dictionary mapping words to ids (inverse of id2word)
        lang: language of the dictionary

    Usage:
        dico.index(word) - returns an index
        dico[index] - returns the word
    """

    def __init__(self, id2word, word2id, lang):
        assert len(id2word) == len(word2id)
        self.id2word = id2word
        self.word2id = word2id
        self.lang = lang
        self.check_valid()

    def __len__(self):
        """
        Returns the number of words in the dictionary.
        """
        return len(self.id2word)

    def __getitem__(self, i):
        """
        Returns the word of the specified index.
        """
        return self.id2word[i]

    def __contains__(self, w):
        """
        Returns whether a word is in the dictionary.
        """
        return w in self.word2id

    def __eq__(self, y):
        """
        Compare the dictionary with another one.

        Two dictionaries are equal when they have the same language and
        the same word at every index. Comparing with an object that is not
        a Dictionary yields NotImplemented, so ``dico == None`` evaluates
        to False instead of raising AttributeError.
        """
        if not isinstance(y, Dictionary):
            return NotImplemented
        self.check_valid()
        y.check_valid()
        if len(self.id2word) != len(y):
            return False
        return self.lang == y.lang and all(
            self.id2word[i] == y[i] for i in range(len(y)))

    def check_valid(self):
        """
        Check that the dictionary is valid.

        Asserts that both mappings have the same size and that every id in
        0..n-1 maps to a word which maps back to that id.
        """
        assert len(self.id2word) == len(self.word2id)
        for i in range(len(self.id2word)):
            assert self.word2id[self.id2word[i]] == i

    def index(self, word):
        """
        Returns the index of the specified word.

        Raises KeyError if the word is not in the dictionary.
        """
        return self.word2id[word]